
/*
***Data read - takes long, recommend continuing from the saved stata file ct600.dta:

qui use "H:\CT600\Refreshed data - May 2014\LN_CT600_2013REFRESH.dta", clear

*variable selection

qui drop   act charges_paid cta_source_ind cvs_inv_relief  ///
 interest_under_s468l man_exps_scha_loss_r mscr_r  ///
 non_corp_dist_to_profits_r  rf_ct_included  s501a_tax_charge ///
 section_747 section_4191  ///
tax_non_corp_dist_rate tax_underlying_rate_rem_profit_r tonnage_tax_profits 

compress

*number of non-missing observations for each variable

sum

rename taxpayer identifier

qui save "$(hmrc_datadir)\CT600\ct600.dta", replace

*/
set more off

cap log using "${hmrc_dir}\Log\dataprep_ct600.smcl", replace


***Cleaning to convert string variables into numeric for concatenating multiple accounts:

use "${hmrc_datadir}\CT600\ct600.dta", clear

*Destring the Accounting Period Variables

duplicates drop
gen ap_start_date2=accounting_period_start_date
label var  ap_start_date2 "Accounting period start date in original file"
gen ap_end_date2=accounting_period_end_date
label var  ap_end_date2 "Accounting period end date in original file"
rename accounting_period_start_date apsd
rename accounting_period_end_date aped

label var  apsd "Accounting period start date"
label var  aped "Accounting period end date"

local varlist  apsd aped

foreach var of varlist `varlist' {

	split `var', parse(-) destring
	drop `var'
	compress

	rename `var'1 year
	rename `var'2 month
	rename `var'3 day

	gen `var'= mdy(month, day, year)
	rename day day_`var'
	rename month month_`var'
	rename year year_`var'
	
	}
	
	
rename apsd accounting_period_start_date
rename aped accounting_period_end_date


*Financial year by Accounting Period End

gen day=day(accounting_period_end_date)
gen month=month(accounting_period_end_date)
gen year=year(accounting_period_end_date)

merge m:1 day month year using "${hmrc_datadir}\CT600\date_numerically.dta"

tab _merge 
qui drop if _merge!=3 
qui drop _merge

qui rename day end_day
qui rename month end_month
qui rename year end_year
qui rename date_numerically end_date_excel

qui gen fy=.
qui replace fy=2000 if end_date<=36981
qui replace fy=2001 if (36981<end_date) & (end_date<=37346)
qui replace fy=2002 if (37346<end_date) & (end_date<=37711)
qui replace fy=2003 if (37711<end_date) & (end_date<=38077)
qui replace fy=2004 if (38077<end_date) & (end_date<=38442)
qui replace fy=2005 if (38442<end_date) & (end_date<=38807)
qui replace fy=2006 if (38807<end_date) & (end_date<=39172)
qui replace fy=2007 if (39172<end_date) & (end_date<=39538)
qui replace fy=2008 if (39538<end_date) & (end_date<=39903)
qui replace fy=2009 if (39903<end_date) & (end_date<=40268)
qui replace fy=2010 if (40268<end_date) & (end_date<=40633)
qui replace fy=2011 if (40633<end_date) & (end_date<=40999)
qui replace fy=2012 if (40999<end_date) & (end_date<=41365)

qui label var fy "Financial Year "
qui label define fy_label 2000 "2000-01"
qui label define fy_label 2001 "2001-02", add
qui label define fy_label 2002 "2002-03", add
qui label define fy_label 2003 "2003-04", add
qui label define fy_label 2004 "2004-05", add
qui label define fy_label 2005 "2005-06", add
qui label define fy_label 2006 "2006-07", add
qui label define fy_label 2007 "2007-08", add
qui label define fy_label 2008 "2008-09", add
qui label define fy_label 2009 "2009-10", add
qui label define fy_label 2010 "2010-11", add
qui label define fy_label 2011 "2011-12", add
qui label define fy_label 2012 "2012-13", add

qui label values fy fy_label
qui rename fy year

*Numeric labeling of indicator variables
foreach type in sme large_co {
gen r_and_d_`type'_claim2=r_and_d_`type'_claim
replace r_and_d_`type'_claim2="1" if r_and_d_`type'_claim!=""
replace r_and_d_`type'_claim2="0" if r_and_d_`type'_claim2!="1"  
gen r_and_d_`type'_claim3=real(r_and_d_`type'_claim2)
tab r_and_d_`type'_claim r_and_d_`type'_claim3
drop r_and_d_`type'_claim2
rename r_and_d_`type'_claim3 r_and_d_`type'_claim2
label variable r_and_d_`type'_claim2 "Numeric identifier for `type' credit claim"
}

*Numeric labeling of indicator variables in R&D data:
gen instalment_payments2=1 if instalment_payments!="" & instalment_payments!="0"
replace instalment_payments2=0 if instalment_payments=="" | instalment_payments=="0" 
label variable instalment_payments2 "instalment payments indicator numeric values"

encode group_status_ind, generate(group_status_ind2)
label variable group_status_ind2 "group_status_ind numeric values"
drop group_status_ind

gen small_comp_marginal_rate2=1 if small_comp_marginal_rate=="X"
replace small_comp_marginal_rate2=0 if small_comp_marginal_rate!="X"
replace r_gci_loss_amt=abs(r_gci_loss_amt)
replace a_gci_loss_amt=abs(a_gci_loss_amt)

qui save "${hmrc_datadir}\CT600\ct600_dated", replace

***Dealing with multiple accounts:

use "${hmrc_datadir}\CT600\ct600_dated", clear

*First clean some unexpected negatives in the data:
foreach var of varlist gross_ct_chargeable_r ct_net_marg_rate_relief_r  ct_outstanding_r {
sum `var', d
}
foreach var of varlist gross_ct_chargeable_r ct_net_marg_rate_relief_r  ct_outstanding_r {
replace `var'=. if `var'<0
}

*Create separate files for companies with single accounts and companies with multiple accounts:

//*Identify companies with multiple accountings
qui egen multiples=count(identifier), by (identifier year)
gen ap_duration=accounting_period_end_date-accounting_period_start_date+1
label var ap_duration "Accounting period duration"
bysort multiples: sum ap_duration
bysort identifier year: egen total_duration=total(ap_duration)
//*save companies with multiple accounts within the same fiscal year to a different file*/
bysort identifier year: egen delete=max(multiples)
preserve
qui keep if delete>=2
qui save "${hmrc_datadir}\CT600\ct600_multipleaccts.dta", replace
restore
//*drop companies with multiple accounts within the same fiscal year*/
qui drop if delete>=2
qui drop delete
qui gen singleacct="single"
save "${hmrc_datadir}\CT600\ct600_singleaccts.dta", replace

//consolidate firms that filed multiple accounts per fiscal year
*companies with multiple accts of total duration less than 366: can simply add up
*companies with multiple accts of total duration more than 366: dropped

use "${hmrc_datadir}\CT600\ct600_multipleaccts.dta", clear
qui drop delete
qui sort identifier accounting_period_number
sum ap_duration total_duration
sum if total_duration<=366
sum if multiples>=2 & total_duration<=366
sum if multiples>=2 & total_duration > 366
qui sort identifier year accounting_period_number
drop if total_duration>366 & multiples>=2

rename trading_losses_brought_forward_r trading_losses_brought

*To avoid the - collapse - command from replacing missing with zeros, perform the following code:
//not an issue for the indicator variables so those are not included in this correction for missing values
bysort id year: egen seq = seq()
foreach var of varlist nontrade_capital_allowances group_relief franked_inv_income dtr trading_turnover amount_of_turnover_in_ap ///
 r_and_d_enhanced_exp vaccine_research_exp sme_claim_as_large_co r_and_d_repayable_r r_and_d_tax_credit_r ///
m_and_p_exp_qual_wda m_and_p_exp_long_life ///
m_and_p_exp_qual_fya film_tax_relief des_env_friendly ca_plant_amt ca_amt /// 
trading_profit_r trading_losses_brought trading_losses_case_i_r net_tax_payable_r ///
profits_chargeable_r net_trading_profit_r schedule_d_case_vi_r p_and_m_other_bcs ///
p_and_m_long_life_cas overseas_income nt_gains_intangibles nt_deficits d_and_r_nt_loss_intangibles d_and_r_trading_losses_r ///
ct_already_paid profits_before_chges ct_outstanding_r ct_overpaid_r income_tax_repayable_r r_and_d_repayable_r ///
aba_and_other_bcs aba_and_other_cas cars_bcs cars_cas iba_bcs iba_cas lde_nt_loss_intangibles ///
capital_allowances bcs_bpr cas_bpr cas_bpr_non_trade bcs_bpr_non_trade losses_b_f_s3938 losses_on_shares ///
net_non_trade_losses non_trade_loss_b_f nontrade_bcs nontrade_cas  ///
 nt_loan_profit other_annual_profits  p_and_m_long_life_bcs  ///
p_and_m_other_cas schedule_d_case_iii taxed_income ct_chargeable_r net_trading_profit_r gross_ct_chargeable_r ///
total_remedn_credit_r allowable_losses_r profits_r bal_amt bal_plant_amt a_gci_loss_amt a_gci_prof_amt r_gci_loss_amt r_gci_prof_amt ///
cas_aia_trade cas_aia_non_trade ca_fy_tax_credit_r m_and_p_exp_qual_fya m_and_p_exp_long_life m_and_p_exp_qual_wda ///
available_losses case_v sur_nt_deficits schedule_a sur_schedule_a overseas_property case_vi capital_losses sur_excess_nt_cas ///
sur_nt_loss_intang  total_dedns_reliefs s102_surrender   dtr fy1_amount_of_profit_entry_1 ///
group_relief {
	bysort id year: egen c`var'=count(`var') if (`var'==.) 
	replace c`var'=1 if (c`var'==0)
	bysort id year: egen c2`var'=count(`var')
	bysort id year: egen sum`var'=sum(`var')
	bysort id year: egen mn`var'=mean(`var')
	replace sum`var'=-99 if (mn`var'==.)
	replace sum`var'=. if (seq>1)
	replace sum`var'=-99 if (c2`var'!=0)
	drop c`var' c2`var' mn`var' `var'
	rename sum`var' `var'
}
drop seq

//collapse the multiple entries into a common period:
qui collapse (max)  multiples  r_and_d_sme_claim2 r_and_d_large_co_claim2 small_comp_marginal_rate2 ///
accounting_period_end_date  accounting_period_number cta_size_ind stratum_ind group_status_ind2 ap_no_of_assoc_companies /// 
underlying_rate_ct fy1_no_of_assoc_companies_r fy2_no_of_assoc_companies instalment_payments2 end_date ///
(sum)  nontrade_capital_allowances group_relief franked_inv_income dtr trading_turnover amount_of_turnover_in_ap ///
(sum)  r_and_d_enhanced_exp vaccine_research_exp sme_claim_as_large_co r_and_d_repayable_r r_and_d_tax_credit_r ///
m_and_p_exp_qual_wda m_and_p_exp_long_life ///
(min)  accounting_period_start_date trade_tcn ///
(sum) m_and_p_exp_qual_fya film_tax_relief des_env_friendly ca_plant_amt ca_amt /// 
trading_profit_r trading_losses_brought trading_losses_case_i_r net_tax_payable_r ///
profits_chargeable_r net_trading_profit_r schedule_d_case_vi_r p_and_m_other_bcs ///
p_and_m_long_life_cas overseas_income nt_gains_intangibles nt_deficits d_and_r_nt_loss_intangibles d_and_r_trading_losses_r ///
ct_already_paid profits_before_chges ct_outstanding_r ct_overpaid_r income_tax_repayable_r ///
aba_and_other_bcs aba_and_other_cas cars_bcs cars_cas iba_bcs iba_cas lde_nt_loss_intangibles ///
capital_allowances bcs_bpr cas_bpr cas_bpr_non_trade bcs_bpr_non_trade losses_b_f_s3938 losses_on_shares ///
net_non_trade_losses non_trade_loss_b_f nontrade_bcs nontrade_cas  /// 
nt_loan_profit other_annual_profits  p_and_m_long_life_bcs  ///
p_and_m_other_cas schedule_d_case_iii taxed_income ct_chargeable_r gross_ct_chargeable_r ///
total_remedn_credit_r allowable_losses_r profits_r bal_amt bal_plant_amt a_gci_loss_amt a_gci_prof_amt r_gci_loss_amt r_gci_prof_amt ///
cas_aia_trade cas_aia_non_trade ca_fy_tax_credit_r ///
available_losses case_v sur_nt_deficits schedule_a sur_schedule_a overseas_property case_vi capital_losses sur_excess_nt_cas ///
sur_nt_loss_intang  total_dedns_reliefs s102_surrender    fy1_amount_of_profit_entry_1 ///
, by(identifier year)

label variable group_status_ind2 "1 if N; 2 if P"

foreach var of varlist nontrade_capital_allowances group_relief franked_inv_income dtr trading_turnover amount_of_turnover_in_ap ///
 r_and_d_enhanced_exp vaccine_research_exp sme_claim_as_large_co r_and_d_repayable_r r_and_d_tax_credit_r ///
m_and_p_exp_qual_wda m_and_p_exp_long_life ///
m_and_p_exp_qual_fya film_tax_relief des_env_friendly ca_plant_amt ca_amt /// 
trading_profit_r trading_losses_brought trading_losses_case_i_r net_tax_payable_r ///
profits_chargeable_r net_trading_profit_r schedule_d_case_vi_r p_and_m_other_bcs ///
p_and_m_long_life_cas overseas_income nt_gains_intangibles nt_deficits d_and_r_nt_loss_intangibles d_and_r_trading_losses_r ///
ct_already_paid profits_before_chges ct_outstanding_r ct_overpaid_r income_tax_repayable_r r_and_d_repayable_r ///
aba_and_other_bcs aba_and_other_cas cars_bcs cars_cas iba_bcs iba_cas lde_nt_loss_intangibles ///
capital_allowances bcs_bpr cas_bpr cas_bpr_non_trade bcs_bpr_non_trade losses_b_f_s3938 losses_on_shares ///
net_non_trade_losses non_trade_loss_b_f nontrade_bcs nontrade_cas  /// 
nt_loan_profit other_annual_profits overseas_income p_and_m_long_life_bcs  ///
p_and_m_other_cas schedule_d_case_iii taxed_income ct_chargeable_r net_trading_profit_r gross_ct_chargeable_r ///
total_remedn_credit_r allowable_losses_r profits_r bal_amt bal_plant_amt a_gci_loss_amt a_gci_prof_amt r_gci_loss_amt r_gci_prof_amt ///
cas_aia_trade cas_aia_non_trade ca_fy_tax_credit_r m_and_p_exp_qual_fya m_and_p_exp_long_life m_and_p_exp_qual_wda ///
available_losses case_v sur_nt_deficits schedule_a sur_schedule_a overseas_property case_vi capital_losses sur_excess_nt_cas ///
sur_nt_loss_intang  total_dedns_reliefs s102_surrender   dtr fy1_amount_of_profit_entry_1 ///
group_relief {
	replace `var'=. if (`var'<0)
}
rename trading_losses_brought trading_losses_brought_forward_r

qui gen singleacct="Collapsed"
qui save "${hmrc_datadir}\CT600\ct600_multipleaccts2merge.dta", replace
/*Consolidating companies with multiple accounts*/
qui use "${hmrc_datadir}\CT600\ct600_singleaccts.dta", clear
qui append using "${hmrc_datadir}\CT600\ct600_multipleaccts2merge.dta"
sum
qui drop multiples
qui bysort identifier year: gen multiples=_N
tab multiples
tab singleacct
replace r_gci_loss_amt=abs(r_gci_loss_amt)
replace a_gci_loss_amt=abs(a_gci_loss_amt)
qui save "${hmrc_datadir}\CT600\ct600_dated_collapsed", replace




***drop multiple id in Fame
forval yyyy=2000/2012 {
use "${hmrc_datadir}\FAME\fame`yyyy'", clear
qui bysort identifier: gen multi=_N
qui drop if multi>=2
qui drop multi
qui save "${hmrc_datadir}\FAME\fame`yyyy'_dropped", replace
}

***Match with FAME

use "${hmrc_datadir}\CT600\ct600_dated_collapsed", clear

*** Match FAME with CT600 ***
forval yyyy=2000/2012 {
merge 1:1 identifier year using "${hmrc_datadir}\FAME\fame`yyyy'_dropped", update
tab _merge
qui drop if _merge==2
rename _merge _merge_fame_`yyyy'
}
gen _merge_CT600FAME=_merge_fame_2000
forval yyyy=2001/2012 {
replace _merge_CT600FAME=_merge_fame_`yyyy' if _merge_CT600FAME==1 
}
drop _merge_fame_*
tab _merge_CT600FAME

save "${hmrc_datadir}\CT600\ct600_fame_matched", replace

use "${hmrc_datadir}\CT600\ct600_fame_matched", clear

*Checking Matching Quality
ren trading_turnover turnover_ct600
gen turnover_diff=turnover_ct600 - turnover
sum turnover_diff if turnover!=.

qui count if turnover!=.
qui scalar turnover_fame=r(N)

qui count if turnover_diff<=1000 & turnover_diff >=-1000
qui scalar turnover_delta=r(N)

qui scalar match_ok=(turnover_delta/turnover_fame)*100
sca di match_ok

qui drop turnover_diff

*12-month account in both FAME and ct600
qui count 
qui scalar obs_total=r(N)
qui count if no_of_months==12
qui scalar obs_12=r(N)
qui scalar month_12=obs_12/obs_total
sca di month_12

*unconsolidated accounts only
qui count 
qui scalar obs_total=r(N)
qui count if cons_code=="U"
qui scalar obs_uncons=r(N)
qui scalar uncons=obs_uncons/obs_total
sca di uncons

//here make a call about whether to remove consolidated accounts:
qui drop if cons_code=="C"
qui drop cons_code

sum 

*cost of sales in fame
qui replace cost_of_sales=-cost_of_sales

*stand-alone companies

qui count 
qui scalar obs_total=r(N)
qui count if ap_no_of_assoc_companies>0 &  ap_no_of_assoc_companies!=.
qui scalar obs_group=r(N)
qui scalar group=obs_group/obs_total
sca di group 

/*
qui drop if ap_no_of_assoc_companies>0 &  ap_no_of_assoc_companies!=.
qui drop ap_no_of_assoc_companies

qui drop if group>0 & group!=.
qui drop group
*/


qui ren  primary_uk_sic_code sic_primary

*Date of incorporation
gen date=substr(incorporation_date, 1, 9)
gen incorp_date=date(date, "DMY")
qui drop date incorporation_date

*Company type

encode company_type, gen (type)
tab type

drop company_type type

compress

sum

foreach var of varlist fixed_assets overseas_turnover research_and_development total_assets turnover uk_turnover incorp_date {
replace `var'=. if `var'<0
}

*rename and labeling variables*
qui ren accounting_period_number ap_number
qui ren trading_turnover turnover
qui ren profits_chargeable_r profits_chargeable
qui ren accounting_period_start_date ap_start_date
qui ren accounting_period_end_date ap_end_date
qui ren group_status_ind group_status
qui ren trading_profit_r trading_profit

label var identifier "id"
label var ap_number "Accounting period number"
label var turnover "Trading turnover"
label var profits_chargeable "Taxable corporate profit"
label var ap_start_date "Accounting period start date"
label var ap_end_date "Accounting period end date"
label var group_status "Group status"
label var group_relief "Group relief"
label var overseas_income "Overseas income"
label var trade_tcn "Trade tcn"
label var trading_profit "Tading profit"

qui save "${hmrc_datadir}\CT600\ct600_fame", replace

*save a dataset with id only to be merged with ownership data*
preserve
keep id
duplicates drop 
save "${hmrc_dir}\Data\CT600\ct600_id.dta", replace
restore

qui save "${hmrc_datadir}\FAME\ct600_fame", replace



log off
exit
